# -*- coding: utf-8 -*-

# author: Keengle (http://www.kgblog.net)
from spdUtility import PriorityQueue, Parser
import urllib2
import os

def updatePriQueue(priQueue, url):
    "Update the priority queue: raise the priority of a known url, or insert a new one."
    extraPrior = 2 if url.endswith('.html') else 0  # prefer urls ending in .html
    extraMyBlog = 5 if 'www.itlong.com' in url else 0  # prefer pages on the target site
    item = priQueue.getitem(url)

    if item:
        # url is already queued: re-insert it with a raised priority
        newitem = (item[0] + 1 + extraPrior + extraMyBlog, item[1])
        priQueue.remove(item)
        priQueue.push(newitem)
    else:
        priQueue.push((1 + extraPrior + extraMyBlog, url))
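# For example, an '.html' link on www.itlong.com enters the queue with
# priority 1 + 2 + 5 = 8, and gains another 8 every further time it is seen.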

def getmainurl(url):
    "Return the site root of url, used to prefix relative link addresses."
    ix = url.find('/', len('http://'))
    if ix > 0:
        return url[:ix]
    else:
        return url
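# e.g. getmainurl('http://www.kgblog.net/a/b.html') returns 'http://www.kgblog.net',
# while a url with no path component is returned unchanged.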

def analyseHtml(url, html, priQueue, downlist):
    "Parse the hyperlinks out of html and update the priority queue."
    p = Parser()
    try:
        p.feed(html)
        p.close()
    except:  # malformed html: give up on this page
        return

    mainurl = getmainurl(url)
    for k, v in p.anchors.items():
        for u in v:
            if not u.startswith('http://'):  # resolve relative urls
                u = mainurl + u
            if u not in downlist:  # skip urls that were already downloaded
                updatePriQueue(priQueue, u)
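# Note: the simple 'mainurl + u' join assumes site-absolute links such as
# '/a/b.html'; page-relative links like 'b.html' or '../b.html' would yield
# broken urls.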

def downloadUrl(id, url, priQueue, downlist, downFolder):
    "Download the content at url, save it, then scan it for further hyperlinks."
    downFileName = downFolder + '/%d.html' % (id,)
    print 'downloading', url, 'as', downFileName,
    try:
        fp = urllib2.urlopen(url)
    except:  # bad url, network error, http error...
        print '[ failed ]'
        return False
    else:
        print '[ success ]'
        downlist.add(url)  # record the url so it is not downloaded again

        html = fp.read()
        op = open(downFileName, "wb")
        op.write(html)
        op.close()
        fp.close()
        analyseHtml(url, html, priQueue, downlist)
        return True

def spider(beginurl, pages, downFolder):
    "Main crawl loop: repeatedly pop the highest-priority url and process it."
    priQueue = PriorityQueue()
    downlist = set()  # urls already downloaded, to avoid fetching twice
    priQueue.push((1, beginurl))
    i = 0

    while not priQueue.empty() and i < pages:
        k, url = priQueue.pop()
        if downloadUrl(i + 1, url, priQueue, downlist, downFolder):
            i += 1
    print '\nDownloaded', i, 'pages in total.'

def main():
    "Entry point: set the start url, the page count and the download folder."
    beginurl = 'http://www.itlong.com'  # url the crawl starts from
    pages = 20  # number of pages to download
    downloadFolder = './spiderDown'  # folder the pages are saved into
    if not os.path.isdir(downloadFolder):
        os.mkdir(downloadFolder)
    spider(beginurl, pages, downloadFolder)

if __name__ == '__main__':
    main()
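# A minimal sketch (not part of the original) of taking the same three
# parameters from the command line instead of hard-coding them in main():
#
#   import sys
#   if len(sys.argv) == 4:  # e.g. spider.py <beginurl> <pages> <folder>
#       spider(sys.argv[1], int(sys.argv[2]), sys.argv[3])
#   else:
#       main()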

# filename: spdUtility.py
# -*- coding: utf-8 -*-
import bisect
import htmllib
import formatter

class PriorityQueue(list):
    "Priority queue of (priority, url) tuples, kept sorted in ascending order."
    def __init__(self):
        list.__init__(self)
        self.map = {}  # url -> (priority, url), for O(1) lookup by url

    def push(self, item):
        # Insert in sorted order, skipping duplicates; bisect.insort keeps the
        # list ascending (bisect.insort_left would place equal items on the left).
        if self.count(item) == 0:
            bisect.insort(self, item)
            self.map[item[1]] = item

    def pop(self):
        # The list is ascending, so list.pop() removes the highest-priority item.
        r = list.pop(self)
        del self.map[r[1]]
        return r

    def getitem(self, url):
        "Return the queued (priority, url) tuple for url, or None if absent."
        return self.map.get(url)
       
    def empty(self):
        return len(self) == 0
   
    def remove(self, item):
        list.remove(self, item)
        del self.map[item[1]]

    def count(self, item):
        "Binary search in the sorted list: return 1 if item is present, else 0."
        if len(self) == 0:
            return 0

        left = 0
        right = len(self) - 1
        mid = -1
        while left <= right:
            mid = (left + right) // 2
            if self[mid] < item:
                left = mid + 1
            elif self[mid] > item:
                right = mid - 1
            else:
                break
        return 1 if self[mid] == item else 0
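# Design note: bisect.insort keeps the whole list sorted, so push costs O(n)
# in element shifts while popping the largest item is O(1). An illustrative
# alternative (not part of the original) using heapq pushes and pops in
# O(log n), negating priorities because heapq builds a min-heap:
#
#   import heapq
#   heap = []
#   heapq.heappush(heap, (-priority, url))  # largest priority pops first
#   neg, url = heapq.heappop(heap)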

class Parser(htmllib.HTMLParser):
    "HTML parser that collects anchors as {link text: [href, ...]}."
    def __init__(self, verbose=0):
        self.anchors = {}
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)

    def anchor_bgn(self, href, name, type):
        self.save_bgn()  # start buffering the anchor's text
        self.anchor = href

    def anchor_end(self):
        text = self.save_end().strip()
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]
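# Illustrative use on a small fragment:
#
#   p = Parser()
#   p.feed('<a href="http://www.kgblog.net/">blog</a>')
#   p.close()
#   print p.anchors  # -> {'blog': ['http://www.kgblog.net/']}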

def main():  # quick self-test of PriorityQueue and its url map
    pq = PriorityQueue()
    # add items out of order
    pq.push( (1,'http://www.baidu.com') )
    pq.push( (2,'http://www.sina.com') )
    pq.push( (3,'http://www.google.com') )
    pq.push( (1,'http://www.163.com') )   

    item = pq.getitem('http://www.sina.com')
    print item
    print pq.count(item)
    pq.remove( item )
    print pq.count(item)

    # print queue contents
    while not pq.empty():
        print pq.pop()     

if __name__ == '__main__':
    main()
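# Expected output: the (2, 'http://www.sina.com') tuple, the counts 1 and 0,
# then the remaining items popped highest-priority first:
# (3, google), (1, baidu), (1, 163).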
